Importing libraries needed

In [1]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

Loading dataset

In [2]:
df = pd.read_csv("PEP1.csv")

Visualising some data in dataset

In [3]:
df.head()
Out[3]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

In [4]:
# Dropping Id column 
df.drop(['Id'], axis=1,inplace=True)

Information about the dataset

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchebvGr     1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functiol       1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB

Identifying the shape of the dataset

In [6]:
df.shape
Out[6]:
(1460, 80)

pd.set_option("display.max_rows",None)

Pandas doesn't print all columns by default. If you want to permanently set Pandas print options to display the output of all columns. you can use any of these options for a TEMPORARY solution:

Identifying variables with null values and plotting them

In [7]:
null_values = pd.isnull(df).sum()[pd.isnull(df).sum()>0]
null_values
Out[7]:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
In [8]:
def plot_nas(df: pd.DataFrame):
    if df.isnull().sum().sum() != 0:
        na_df = (df.isnull().sum() / len(df))      
        na_df = na_df.drop(na_df[na_df == 0].index).sort_values(ascending=False)
        missing_data = pd.DataFrame({'Variables with null values' :na_df})
        missing_data.plot(kind = "barh")
        plt.show()
    else:
        print('No NAs found')
plot_nas(df)

Identifying variables with unique values and plotting them

In [9]:
for col in df:
    print(col, df[col].nunique())
MSSubClass 15
MSZoning 5
LotFrontage 110
LotArea 1073
Street 2
Alley 2
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 8
BldgType 5
HouseStyle 8
OverallQual 10
OverallCond 9
YearBuilt 112
YearRemodAdd 61
RoofStyle 6
RoofMatl 8
Exterior1st 15
Exterior2nd 16
MasVnrType 4
MasVnrArea 327
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 4
BsmtCond 4
BsmtExposure 4
BsmtFinType1 6
BsmtFinSF1 637
BsmtFinType2 6
BsmtFinSF2 144
BsmtUnfSF 780
TotalBsmtSF 721
Heating 6
HeatingQC 5
CentralAir 2
Electrical 5
1stFlrSF 753
2ndFlrSF 417
LowQualFinSF 24
GrLivArea 861
BsmtFullBath 4
BsmtHalfBath 3
FullBath 4
HalfBath 3
BedroomAbvGr 8
KitchebvGr 4
KitchenQual 4
TotRmsAbvGrd 12
Functiol 7
Fireplaces 4
FireplaceQu 5
GarageType 6
GarageYrBlt 97
GarageFinish 3
GarageCars 5
GarageArea 441
GarageQual 5
GarageCond 5
PavedDrive 3
WoodDeckSF 274
OpenPorchSF 202
EnclosedPorch 120
3SsnPorch 20
ScreenPorch 76
PoolArea 8
PoolQC 3
Fence 4
MiscFeature 4
MiscVal 21
MoSold 12
YrSold 5
SaleType 9
SaleCondition 6
SalePrice 663

Generate a separate dataset for numerical and categorical variables

In [10]:
nu = df.nunique().reset_index()
nu.columns = ['feature','nunique']
plt.figure(figsize=(7,20))
ax = sns.barplot(y='feature', x='nunique', data=nu)

Generating Seprate Dataset ---> (numerical data and categorical data)

In [4]:
numeric_data = df.select_dtypes(include=[np.number])
In [12]:
numeric_data.head(3)
Out[12]:
MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
0 60 65.0 8450 7 5 2003 2003 196.0 706 0 ... 0 61 0 0 0 0 0 2 2008 208500
1 20 80.0 9600 6 8 1976 1976 0.0 978 0 ... 298 0 0 0 0 0 0 5 2007 181500
2 60 68.0 11250 7 5 2001 2002 162.0 486 0 ... 0 42 0 0 0 0 0 9 2008 223500

3 rows × 37 columns

In [13]:
# Identifying shape of the dataset
numeric_data.shape
Out[13]:
(1460, 37)
In [5]:
categorical_data = df.select_dtypes(exclude=[np.number])
In [15]:
categorical_data.head(3)
Out[15]:
MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 ... GarageType GarageFinish GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition
0 RL Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
1 RL Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
2 RL Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal

3 rows × 43 columns

Dealing with numerical data

Identifying any missing values in numerical data

In [16]:
pd.isnull(numeric_data).sum()[pd.isnull(numeric_data).sum()>0]
Out[16]:
LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

Missing value treatment using mean

In [6]:
numeric_data['LotFrontage'].fillna(numeric_data['LotFrontage'].mean(), inplace = True)
numeric_data['MasVnrArea'].fillna(numeric_data['MasVnrArea'].mean(), inplace = True)
numeric_data['GarageYrBlt'].fillna(numeric_data['GarageYrBlt'].mean(), inplace = True)
In [7]:
# Checking if there is any null value present
pd.isnull(numeric_data).sum().sum()
Out[7]:
0

Identify the skewness and distribution and Poltting it

In [8]:
skew=numeric_data.skew(axis = 0, skipna = True)
print(skew)
Id                0.000000
MSSubClass        1.407657
LotFrontage       2.384950
LotArea          12.207688
OverallQual       0.216944
OverallCond       0.693067
YearBuilt        -0.613461
YearRemodAdd     -0.503562
MasVnrArea        2.676412
BsmtFinSF1        1.685503
BsmtFinSF2        4.255261
BsmtUnfSF         0.920268
TotalBsmtSF       1.524255
1stFlrSF          1.376757
2ndFlrSF          0.813030
LowQualFinSF      9.011341
GrLivArea         1.366560
BsmtFullBath      0.596067
BsmtHalfBath      4.103403
FullBath          0.036562
HalfBath          0.675897
BedroomAbvGr      0.211790
KitchebvGr        4.488397
TotRmsAbvGrd      0.676341
Fireplaces        0.649565
GarageYrBlt      -0.668175
GarageCars       -0.342549
GarageArea        0.179981
WoodDeckSF        1.541376
OpenPorchSF       2.364342
EnclosedPorch     3.089872
3SsnPorch        10.304342
ScreenPorch       4.122214
PoolArea         14.828374
MiscVal          24.476794
MoSold            0.212053
YrSold            0.096269
SalePrice         1.882876
dtype: float64
In [20]:
for col in numeric_data:
    print(col,skew[col])
    plt.figure()
    sns.distplot(numeric_data[col])
    plt.show()
MSSubClass 1.4076567471495591
LotFrontage 2.3849501679427925
LotArea 12.207687851233496
OverallQual 0.2169439277628693
OverallCond 0.6930674724842182
YearBuilt -0.613461172488183
YearRemodAdd -0.5035620027004709
MasVnrArea 2.676411784729959
BsmtFinSF1 1.685503071910789
BsmtFinSF2 4.255261108933303
BsmtUnfSF 0.9202684528039037
TotalBsmtSF 1.5242545490627664
1stFlrSF 1.3767566220336365
2ndFlrSF 0.8130298163023265
LowQualFinSF 9.011341288465387
GrLivArea 1.3665603560164552
BsmtFullBath 0.596066609663168
BsmtHalfBath 4.103402697955168
FullBath 0.036561558402727165
HalfBath 0.675897448233722
BedroomAbvGr 0.21179009627507137
KitchebvGr 4.488396777072859
TotRmsAbvGrd 0.6763408364355531
Fireplaces 0.6495651830548841
GarageYrBlt -0.6681748227563908
GarageCars -0.3425489297486655
GarageArea 0.17998090674623907
WoodDeckSF 1.5413757571931312
OpenPorchSF 2.3643417403694404
EnclosedPorch 3.08987190371177
3SsnPorch 10.304342032693112
ScreenPorch 4.122213743143115
PoolArea 14.828373640750588
MiscVal 24.476794188821916
MoSold 0.21205298505146022
YrSold 0.09626851386568028
SalePrice 1.8828757597682129

Identifying significant variables using a correlation matrix

In [9]:
corr = numeric_data.corr()
corr.style.background_gradient(cmap='twilight_shifted').set_precision(2)
Out[9]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchebvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
Id 1.00 0.01 -0.01 -0.03 -0.03 0.01 -0.01 -0.02 -0.05 -0.01 -0.01 -0.01 -0.02 0.01 0.01 -0.04 0.01 0.00 -0.02 0.01 0.01 0.04 0.00 0.03 -0.02 0.00 0.02 0.02 -0.03 -0.00 0.00 -0.05 0.00 0.06 -0.01 0.02 0.00 -0.02
MSSubClass 0.01 1.00 -0.36 -0.14 0.03 -0.06 0.03 0.04 0.02 -0.07 -0.07 -0.14 -0.24 -0.25 0.31 0.05 0.07 0.00 -0.00 0.13 0.18 -0.02 0.28 0.04 -0.05 0.08 -0.04 -0.10 -0.01 -0.01 -0.01 -0.04 -0.03 0.01 -0.01 -0.01 -0.02 -0.08
LotFrontage -0.01 -0.36 1.00 0.31 0.23 -0.05 0.12 0.08 0.18 0.22 0.04 0.12 0.36 0.41 0.07 0.04 0.37 0.09 -0.01 0.18 0.05 0.24 -0.01 0.32 0.24 0.06 0.27 0.32 0.08 0.14 0.01 0.06 0.04 0.18 0.00 0.01 0.01 0.33
LotArea -0.03 -0.14 0.31 1.00 0.11 -0.01 0.01 0.01 0.10 0.21 0.11 -0.00 0.26 0.30 0.05 0.00 0.26 0.16 0.05 0.13 0.01 0.12 -0.02 0.19 0.27 -0.02 0.15 0.18 0.17 0.08 -0.02 0.02 0.04 0.08 0.04 0.00 -0.01 0.26
OverallQual -0.03 0.03 0.23 0.11 1.00 -0.09 0.57 0.55 0.41 0.24 -0.06 0.31 0.54 0.48 0.30 -0.03 0.59 0.11 -0.04 0.55 0.27 0.10 -0.18 0.43 0.40 0.52 0.60 0.56 0.24 0.31 -0.11 0.03 0.06 0.07 -0.03 0.07 -0.03 0.79
OverallCond 0.01 -0.06 -0.05 -0.01 -0.09 1.00 -0.38 0.07 -0.13 -0.05 0.04 -0.14 -0.17 -0.14 0.03 0.03 -0.08 -0.05 0.12 -0.19 -0.06 0.01 -0.09 -0.06 -0.02 -0.31 -0.19 -0.15 -0.00 -0.03 0.07 0.03 0.05 -0.00 0.07 -0.00 0.04 -0.08
YearBuilt -0.01 0.03 0.12 0.01 0.57 -0.38 1.00 0.59 0.31 0.25 -0.05 0.15 0.39 0.28 0.01 -0.18 0.20 0.19 -0.04 0.47 0.24 -0.07 -0.17 0.10 0.15 0.78 0.54 0.48 0.22 0.19 -0.39 0.03 -0.05 0.00 -0.03 0.01 -0.01 0.52
YearRemodAdd -0.02 0.04 0.08 0.01 0.55 0.07 0.59 1.00 0.18 0.13 -0.07 0.18 0.29 0.24 0.14 -0.06 0.29 0.12 -0.01 0.44 0.18 -0.04 -0.15 0.19 0.11 0.62 0.42 0.37 0.21 0.23 -0.19 0.05 -0.04 0.01 -0.01 0.02 0.04 0.51
MasVnrArea -0.05 0.02 0.18 0.10 0.41 -0.13 0.31 0.18 1.00 0.26 -0.07 0.11 0.36 0.34 0.17 -0.07 0.39 0.09 0.03 0.28 0.20 0.10 -0.04 0.28 0.25 0.25 0.36 0.37 0.16 0.12 -0.11 0.02 0.06 0.01 -0.03 -0.01 -0.01 0.48
BsmtFinSF1 -0.01 -0.07 0.22 0.21 0.24 -0.05 0.25 0.13 0.26 1.00 -0.05 -0.50 0.52 0.45 -0.14 -0.06 0.21 0.65 0.07 0.06 0.00 -0.11 -0.08 0.04 0.26 0.15 0.22 0.30 0.20 0.11 -0.10 0.03 0.06 0.14 0.00 -0.02 0.01 0.39
BsmtFinSF2 -0.01 -0.07 0.04 0.11 -0.06 0.04 -0.05 -0.07 -0.07 -0.05 1.00 -0.21 0.10 0.10 -0.10 0.01 -0.01 0.16 0.07 -0.08 -0.03 -0.02 -0.04 -0.04 0.05 -0.09 -0.04 -0.02 0.07 0.00 0.04 -0.03 0.09 0.04 0.00 -0.02 0.03 -0.01
BsmtUnfSF -0.01 -0.14 0.12 -0.00 0.31 -0.14 0.15 0.18 0.11 -0.50 -0.21 1.00 0.42 0.32 0.00 0.03 0.24 -0.42 -0.10 0.29 -0.04 0.17 0.03 0.25 0.05 0.19 0.21 0.18 -0.01 0.13 -0.00 0.02 -0.01 -0.04 -0.02 0.03 -0.04 0.21
TotalBsmtSF -0.02 -0.24 0.36 0.26 0.54 -0.17 0.39 0.29 0.36 0.52 0.10 0.42 1.00 0.82 -0.17 -0.03 0.45 0.31 -0.00 0.32 -0.05 0.05 -0.07 0.29 0.34 0.31 0.43 0.49 0.23 0.25 -0.10 0.04 0.08 0.13 -0.02 0.01 -0.01 0.61
1stFlrSF 0.01 -0.25 0.41 0.30 0.48 -0.14 0.28 0.24 0.34 0.45 0.10 0.32 0.82 1.00 -0.20 -0.01 0.57 0.24 0.00 0.38 -0.12 0.13 0.07 0.41 0.41 0.23 0.44 0.49 0.24 0.21 -0.07 0.06 0.09 0.13 -0.02 0.03 -0.01 0.61
2ndFlrSF 0.01 0.31 0.07 0.05 0.30 0.03 0.01 0.14 0.17 -0.14 -0.10 0.00 -0.17 -0.20 1.00 0.06 0.69 -0.17 -0.02 0.42 0.61 0.50 0.06 0.62 0.19 0.07 0.18 0.14 0.09 0.21 0.06 -0.02 0.04 0.08 0.02 0.04 -0.03 0.32
LowQualFinSF -0.04 0.05 0.04 0.00 -0.03 0.03 -0.18 -0.06 -0.07 -0.06 0.01 0.03 -0.03 -0.01 0.06 1.00 0.13 -0.05 -0.01 -0.00 -0.03 0.11 0.01 0.13 -0.02 -0.03 -0.09 -0.07 -0.03 0.02 0.06 -0.00 0.03 0.06 -0.00 -0.02 -0.03 -0.03
GrLivArea 0.01 0.07 0.37 0.26 0.59 -0.08 0.20 0.29 0.39 0.21 -0.01 0.24 0.45 0.57 0.69 0.13 1.00 0.03 -0.02 0.63 0.42 0.52 0.10 0.83 0.46 0.22 0.47 0.47 0.25 0.33 0.01 0.02 0.10 0.17 -0.00 0.05 -0.04 0.71
BsmtFullBath 0.00 0.00 0.09 0.16 0.11 -0.05 0.19 0.12 0.09 0.65 0.16 -0.42 0.31 0.24 -0.17 -0.05 0.03 1.00 -0.15 -0.06 -0.03 -0.15 -0.04 -0.05 0.14 0.12 0.13 0.18 0.18 0.07 -0.05 -0.00 0.02 0.07 -0.02 -0.03 0.07 0.23
BsmtHalfBath -0.02 -0.00 -0.01 0.05 -0.04 0.12 -0.04 -0.01 0.03 0.07 0.07 -0.10 -0.00 0.00 -0.02 -0.01 -0.02 -0.15 1.00 -0.05 -0.01 0.05 -0.04 -0.02 0.03 -0.08 -0.02 -0.02 0.04 -0.03 -0.01 0.04 0.03 0.02 -0.01 0.03 -0.05 -0.02
FullBath 0.01 0.13 0.18 0.13 0.55 -0.19 0.47 0.44 0.28 0.06 -0.08 0.29 0.32 0.38 0.42 -0.00 0.63 -0.06 -0.05 1.00 0.14 0.36 0.13 0.55 0.24 0.47 0.47 0.41 0.19 0.26 -0.12 0.04 -0.01 0.05 -0.01 0.06 -0.02 0.56
HalfBath 0.01 0.18 0.05 0.01 0.27 -0.06 0.24 0.18 0.20 0.00 -0.03 -0.04 -0.05 -0.12 0.61 -0.03 0.42 -0.03 -0.01 0.14 1.00 0.23 -0.07 0.34 0.20 0.19 0.22 0.16 0.11 0.20 -0.10 -0.00 0.07 0.02 0.00 -0.01 -0.01 0.28
BedroomAbvGr 0.04 -0.02 0.24 0.12 0.10 0.01 -0.07 -0.04 0.10 -0.11 -0.02 0.17 0.05 0.13 0.50 0.11 0.52 -0.15 0.05 0.36 0.23 1.00 0.20 0.68 0.11 -0.06 0.09 0.07 0.05 0.09 0.04 -0.02 0.04 0.07 0.01 0.05 -0.04 0.17
KitchebvGr 0.00 0.28 -0.01 -0.02 -0.18 -0.09 -0.17 -0.15 -0.04 -0.08 -0.04 0.03 -0.07 0.07 0.06 0.01 0.10 -0.04 -0.04 0.13 -0.07 0.20 1.00 0.26 -0.12 -0.11 -0.05 -0.06 -0.09 -0.07 0.04 -0.02 -0.05 -0.01 0.06 0.03 0.03 -0.14
TotRmsAbvGrd 0.03 0.04 0.32 0.19 0.43 -0.06 0.10 0.19 0.28 0.04 -0.04 0.25 0.29 0.41 0.62 0.13 0.83 -0.05 -0.02 0.55 0.34 0.68 0.26 1.00 0.33 0.14 0.36 0.34 0.17 0.23 0.00 -0.01 0.06 0.08 0.02 0.04 -0.03 0.53
Fireplaces -0.02 -0.05 0.24 0.27 0.40 -0.02 0.15 0.11 0.25 0.26 0.05 0.05 0.34 0.41 0.19 -0.02 0.46 0.14 0.03 0.24 0.20 0.11 -0.12 0.33 1.00 0.05 0.30 0.27 0.20 0.17 -0.02 0.01 0.18 0.10 0.00 0.05 -0.02 0.47
GarageYrBlt 0.00 0.08 0.06 -0.02 0.52 -0.31 0.78 0.62 0.25 0.15 -0.09 0.19 0.31 0.23 0.07 -0.03 0.22 0.12 -0.08 0.47 0.19 -0.06 -0.11 0.14 0.05 1.00 0.48 0.48 0.22 0.22 -0.29 0.02 -0.08 -0.01 -0.03 0.01 -0.00 0.47
GarageCars 0.02 -0.04 0.27 0.15 0.60 -0.19 0.54 0.42 0.36 0.22 -0.04 0.21 0.43 0.44 0.18 -0.09 0.47 0.13 -0.02 0.47 0.22 0.09 -0.05 0.36 0.30 0.48 1.00 0.88 0.23 0.21 -0.15 0.04 0.05 0.02 -0.04 0.04 -0.04 0.64
GarageArea 0.02 -0.10 0.32 0.18 0.56 -0.15 0.48 0.37 0.37 0.30 -0.02 0.18 0.49 0.49 0.14 -0.07 0.47 0.18 -0.02 0.41 0.16 0.07 -0.06 0.34 0.27 0.48 0.88 1.00 0.22 0.24 -0.12 0.04 0.05 0.06 -0.03 0.03 -0.03 0.62
WoodDeckSF -0.03 -0.01 0.08 0.17 0.24 -0.00 0.22 0.21 0.16 0.20 0.07 -0.01 0.23 0.24 0.09 -0.03 0.25 0.18 0.04 0.19 0.11 0.05 -0.09 0.17 0.20 0.22 0.23 0.22 1.00 0.06 -0.13 -0.03 -0.07 0.07 -0.01 0.02 0.02 0.32
OpenPorchSF -0.00 -0.01 0.14 0.08 0.31 -0.03 0.19 0.23 0.12 0.11 0.00 0.13 0.25 0.21 0.21 0.02 0.33 0.07 -0.03 0.26 0.20 0.09 -0.07 0.23 0.17 0.22 0.21 0.24 0.06 1.00 -0.09 -0.01 0.07 0.06 -0.02 0.07 -0.06 0.32
EnclosedPorch 0.00 -0.01 0.01 -0.02 -0.11 0.07 -0.39 -0.19 -0.11 -0.10 0.04 -0.00 -0.10 -0.07 0.06 0.06 0.01 -0.05 -0.01 -0.12 -0.10 0.04 0.04 0.00 -0.02 -0.29 -0.15 -0.12 -0.13 -0.09 1.00 -0.04 -0.08 0.05 0.02 -0.03 -0.01 -0.13
3SsnPorch -0.05 -0.04 0.06 0.02 0.03 0.03 0.03 0.05 0.02 0.03 -0.03 0.02 0.04 0.06 -0.02 -0.00 0.02 -0.00 0.04 0.04 -0.00 -0.02 -0.02 -0.01 0.01 0.02 0.04 0.04 -0.03 -0.01 -0.04 1.00 -0.03 -0.01 0.00 0.03 0.02 0.04
ScreenPorch 0.00 -0.03 0.04 0.04 0.06 0.05 -0.05 -0.04 0.06 0.06 0.09 -0.01 0.08 0.09 0.04 0.03 0.10 0.02 0.03 -0.01 0.07 0.04 -0.05 0.06 0.18 -0.08 0.05 0.05 -0.07 0.07 -0.08 -0.03 1.00 0.05 0.03 0.02 0.01 0.11
PoolArea 0.06 0.01 0.18 0.08 0.07 -0.00 0.00 0.01 0.01 0.14 0.04 -0.04 0.13 0.13 0.08 0.06 0.17 0.07 0.02 0.05 0.02 0.07 -0.01 0.08 0.10 -0.01 0.02 0.06 0.07 0.06 0.05 -0.01 0.05 1.00 0.03 -0.03 -0.06 0.09
MiscVal -0.01 -0.01 0.00 0.04 -0.03 0.07 -0.03 -0.01 -0.03 0.00 0.00 -0.02 -0.02 -0.02 0.02 -0.00 -0.00 -0.02 -0.01 -0.01 0.00 0.01 0.06 0.02 0.00 -0.03 -0.04 -0.03 -0.01 -0.02 0.02 0.00 0.03 0.03 1.00 -0.01 0.00 -0.02
MoSold 0.02 -0.01 0.01 0.00 0.07 -0.00 0.01 0.02 -0.01 -0.02 -0.02 0.03 0.01 0.03 0.04 -0.02 0.05 -0.03 0.03 0.06 -0.01 0.05 0.03 0.04 0.05 0.01 0.04 0.03 0.02 0.07 -0.03 0.03 0.02 -0.03 -0.01 1.00 -0.15 0.05
YrSold 0.00 -0.02 0.01 -0.01 -0.03 0.04 -0.01 0.04 -0.01 0.01 0.03 -0.04 -0.01 -0.01 -0.03 -0.03 -0.04 0.07 -0.05 -0.02 -0.01 -0.04 0.03 -0.03 -0.02 -0.00 -0.04 -0.03 0.02 -0.06 -0.01 0.02 0.01 -0.06 0.00 -0.15 1.00 -0.03
SalePrice -0.02 -0.08 0.33 0.26 0.79 -0.08 0.52 0.51 0.48 0.39 -0.01 0.21 0.61 0.61 0.32 -0.03 0.71 0.23 -0.02 0.56 0.28 0.17 -0.14 0.53 0.47 0.47 0.64 0.62 0.32 0.32 -0.13 0.04 0.11 0.09 -0.02 0.05 -0.03 1.00
In [22]:
# Identifying significant variables using a correlation matrix----> seaborn
# plt.figure(figsize=(25,25))
# sns.heatmap(numeric_data.corr(),annot=True,cmap='Blues')

Plotting Pair Plot for numerical data

In [23]:
sns.pairplot(numeric_data,diag_kws={'color':'red'}, 
            plot_kws={'color':'green'})
plt.show()
In [24]:
# for i in numeric_data.columns:
#     sns.boxplot(numeric_data[i],color='Green')
#     plt.show()
In [25]:
# for i in numeric_data.columns:
    
#     q1=numeric_data[i].quantile(0.25)

#     q3=numeric_data[i].quantile(0.75)

#     iqr=q3-q1

#     UB= q3+1.5*iqr

#     LB =q1-1.5*iqr
    
#     numeric_data[i]=np.where(numeric_data[i]>UB, UB, numeric_data[i])

#     numeric_data[i]=np.where(numeric_data[i]<LB, LB, numeric_data[i])
    

Dealing with categorical data

In [39]:
categorical_data.head(3)
Out[39]:
MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 ... GarageType GarageFinish GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition
0 RL Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
1 RL Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr ... Attchd RFn TA TA Y NaN NaN NaN WD Normal
2 RL Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm ... Attchd RFn TA TA Y NaN NaN NaN WD Normal

3 rows × 43 columns

Identifying null values in categorical data

In [40]:
pd.isnull(categorical_data).sum()[pd.isnull(categorical_data).sum()>0]
Out[40]:
Alley           1369
MasVnrType         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

Obtaining values/variables which have nulls more than 60% and dropping them

In [41]:
# Below code gives percentage of null in every column
null_percentage = categorical_data.isnull().sum()/numeric_data.shape[0]*100

# Below code gives list of columns having more than 60% null
col_to_drop = null_percentage[null_percentage>60].keys()

output_df = categorical_data.drop(col_to_drop, axis=1)

Missing value treatment for categorical data (Mean)

In [42]:
from sklearn.impute import SimpleImputer
value = output_df.values
 
# defining the imputer
imputer = SimpleImputer(missing_values=np.nan ,strategy='most_frequent')
#for col in cdata:
transformed_values_cate_data = imputer.fit_transform(value)
In [43]:
type(transformed_values_cate_data)
Out[43]:
numpy.ndarray

Converting the ndarray(transformed_values_cate_data) to dataframe

In [44]:
transformed_values_cate_data=pd.DataFrame(transformed_values_cate_data)
transformed_values_cate_data
Out[44]:
0 1 2 3 4 5 6 7 8 9 ... 29 30 31 32 33 34 35 36 37 38
0 RL Pave Reg Lvl AllPub Inside Gtl CollgCr Norm Norm ... Gd Typ Gd Attchd RFn TA TA Y WD Normal
1 RL Pave Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm ... TA Typ TA Attchd RFn TA TA Y WD Normal
2 RL Pave IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm ... Gd Typ TA Attchd RFn TA TA Y WD Normal
3 RL Pave IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm ... Gd Typ Gd Detchd Unf TA TA Y WD Abnorml
4 RL Pave IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm ... Gd Typ TA Attchd RFn TA TA Y WD Normal
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 RL Pave Reg Lvl AllPub Inside Gtl Gilbert Norm Norm ... TA Typ TA Attchd RFn TA TA Y WD Normal
1456 RL Pave Reg Lvl AllPub Inside Gtl NWAmes Norm Norm ... TA Min1 TA Attchd Unf TA TA Y WD Normal
1457 RL Pave Reg Lvl AllPub Inside Gtl Crawfor Norm Norm ... Gd Typ Gd Attchd RFn TA TA Y WD Normal
1458 RL Pave Reg Lvl AllPub Inside Gtl mes Norm Norm ... Gd Typ Gd Attchd Unf TA TA Y WD Normal
1459 RL Pave Reg Lvl AllPub Inside Gtl Edwards Norm Norm ... TA Typ Gd Attchd Fin TA TA Y WD Normal

1460 rows × 39 columns

In [45]:
pd.isnull(transformed_values_cate_data).sum().sum()
# pd.isnull(transformed_values_cate_data).sum()[pd.isnull(transformed_values_cate_data).sum()>0]
Out[45]:
0
In [46]:
# col_add=output_df.columns
# col_add

Adding columns/variables to above dataframe

In [47]:
transformed_values_cate_data.columns=output_df.columns
In [48]:
# Viewing datast
transformed_values_cate_data.head(3)
Out[48]:
MSZoning Street LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 ... KitchenQual Functiol FireplaceQu GarageType GarageFinish GarageQual GarageCond PavedDrive SaleType SaleCondition
0 RL Pave Reg Lvl AllPub Inside Gtl CollgCr Norm Norm ... Gd Typ Gd Attchd RFn TA TA Y WD Normal
1 RL Pave Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm ... TA Typ TA Attchd RFn TA TA Y WD Normal
2 RL Pave IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm ... Gd Typ TA Attchd RFn TA TA Y WD Normal

3 rows × 39 columns

Plotting Count plot

In [49]:
 for col in transformed_values_cate_data:
     plt.figure(figsize=(7,5))
     sns.countplot(y=transformed_values_cate_data[col])
     plt.show()

Plotting box plot

In [50]:
Y = numeric_data['SalePrice']
for col in transformed_values_cate_data:
     plt.figure(figsize=(7,5))
     sns.boxplot(x=Y,y=transformed_values_cate_data[col],showcaps=False,
     flierprops={"marker": "x"},    
     boxprops={"facecolor": (.4, .6, .8, .5)},
     medianprops={"color": "coral"})
     plt.show()

Identifying significant variables using p-values and Chi-Square values

In [52]:
from scipy import stats
for i in transformed_values_cate_data.columns:
    x=pd.crosstab(transformed_values_cate_data[i],df.SalePrice)
    coeff,pval,dof,exp=stats.chi2_contingency(x.values)
    if pval<0.05:
        print('H1--Alternate------>Relation exists',i,df.columns[-1],str(pval))
    else:
        print('H0--Null Hypothsis---- No Relation',i,df.columns[-1],'Dropping the cloumns',str(pval))
        transformed_values_cate_data.drop(i,axis=1,inplace=True)
H1--Alternate------>Relation exists MSZoning SalePrice 4.3483250606822396e-11
H1--Alternate------>Relation exists Street SalePrice 8.338870380464053e-09
H1--Alternate------>Relation exists LotShape SalePrice 4.724729155980402e-12
H0--Null Hypothsis---- No Relation LandContour SalePrice Dropping the cloumns 0.08674645041917711
H0--Null Hypothsis---- No Relation Utilities SalePrice Dropping the cloumns 1.0
H1--Alternate------>Relation exists LotConfig SalePrice 0.045806211958033756
H0--Null Hypothsis---- No Relation LandSlope SalePrice Dropping the cloumns 0.10508638737793884
H1--Alternate------>Relation exists Neighborhood SalePrice 1.364960102688296e-08
H0--Null Hypothsis---- No Relation Condition1 SalePrice Dropping the cloumns 1.0
H0--Null Hypothsis---- No Relation Condition2 SalePrice Dropping the cloumns 0.07598640644469955
H0--Null Hypothsis---- No Relation BldgType SalePrice Dropping the cloumns 0.9999860714473023
H0--Null Hypothsis---- No Relation HouseStyle SalePrice Dropping the cloumns 0.6482615179447816
H0--Null Hypothsis---- No Relation RoofStyle SalePrice Dropping the cloumns 1.0
H0--Null Hypothsis---- No Relation RoofMatl SalePrice Dropping the cloumns 1.0
H0--Null Hypothsis---- No Relation Exterior1st SalePrice Dropping the cloumns 0.9999839433628513
H0--Null Hypothsis---- No Relation Exterior2nd SalePrice Dropping the cloumns 0.8469189757654344
H1--Alternate------>Relation exists MasVnrType SalePrice 1.597523579519887e-06
H1--Alternate------>Relation exists ExterQual SalePrice 4.250289171585687e-34
H1--Alternate------>Relation exists ExterCond SalePrice 9.869790306250171e-13
H1--Alternate------>Relation exists Foundation SalePrice 9.66452199704509e-06
H1--Alternate------>Relation exists BsmtQual SalePrice 1.8224981926842858e-22
H1--Alternate------>Relation exists BsmtCond SalePrice 3.2510088142893966e-13
H1--Alternate------>Relation exists BsmtExposure SalePrice 9.898247719408021e-08
H0--Null Hypothsis---- No Relation BsmtFinType1 SalePrice Dropping the cloumns 0.9663365392953103
H0--Null Hypothsis---- No Relation BsmtFinType2 SalePrice Dropping the cloumns 0.999999609869471
H1--Alternate------>Relation exists Heating SalePrice 2.477753304101386e-24
H0--Null Hypothsis---- No Relation HeatingQC SalePrice Dropping the cloumns 0.9995946871632676
H1--Alternate------>Relation exists CentralAir SalePrice 1.2257126695737677e-05
H0--Null Hypothsis---- No Relation Electrical SalePrice Dropping the cloumns 0.052835134010111195
H1--Alternate------>Relation exists KitchenQual SalePrice 1.2820744991685297e-31
H0--Null Hypothsis---- No Relation Functiol SalePrice Dropping the cloumns 1.0
H1--Alternate------>Relation exists FireplaceQu SalePrice 0.0003226273220519131
H0--Null Hypothsis---- No Relation GarageType SalePrice Dropping the cloumns 0.3800804730475992
H1--Alternate------>Relation exists GarageFinish SalePrice 1.5468634379037297e-12
H0--Null Hypothsis---- No Relation GarageQual SalePrice Dropping the cloumns 0.253478687441585
H0--Null Hypothsis---- No Relation GarageCond SalePrice Dropping the cloumns 1.0
H0--Null Hypothsis---- No Relation PavedDrive SalePrice Dropping the cloumns 0.9912799626956017
H1--Alternate------>Relation exists SaleType SalePrice 4.560785392696702e-14
H1--Alternate------>Relation exists SaleCondition SalePrice 5.613395680294345e-14
In [53]:
transformed_values_cate_data
Out[53]:
MSZoning Street LotShape LotConfig Neighborhood MasVnrType ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure Heating CentralAir KitchenQual FireplaceQu GarageFinish SaleType SaleCondition
0 RL Pave Reg Inside CollgCr BrkFace Gd TA PConc Gd TA No GasA Y Gd Gd RFn WD Normal
1 RL Pave Reg FR2 Veenker None TA TA CBlock Gd TA Gd GasA Y TA TA RFn WD Normal
2 RL Pave IR1 Inside CollgCr BrkFace Gd TA PConc Gd TA Mn GasA Y Gd TA RFn WD Normal
3 RL Pave IR1 Corner Crawfor None TA TA BrkTil TA Gd No GasA Y Gd Gd Unf WD Abnorml
4 RL Pave IR1 FR2 NoRidge BrkFace Gd TA PConc Gd TA Av GasA Y Gd TA RFn WD Normal
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 RL Pave Reg Inside Gilbert None TA TA PConc Gd TA No GasA Y TA TA RFn WD Normal
1456 RL Pave Reg Inside NWAmes Stone TA TA CBlock Gd TA No GasA Y TA TA Unf WD Normal
1457 RL Pave Reg Inside Crawfor None Ex Gd Stone TA Gd No GasA Y Gd Gd RFn WD Normal
1458 RL Pave Reg Inside mes None TA TA CBlock TA TA Mn GasA Y Gd Gd Unf WD Normal
1459 RL Pave Reg Inside Edwards None Gd TA CBlock TA TA No GasA Y TA Gd Fin WD Normal

1460 rows × 19 columns

Combining DataSets----> (numerical dataset and categorical dataset)

In [54]:
# numeric_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1460 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalfBath   1460 non-null   int64  
 19  FullBath       1460 non-null   int64  
 20  HalfBath       1460 non-null   int64  
 21  BedroomAbvGr   1460 non-null   int64  
 22  KitchebvGr     1460 non-null   int64  
 23  TotRmsAbvGrd   1460 non-null   int64  
 24  Fireplaces     1460 non-null   int64  
 25  GarageYrBlt    1460 non-null   float64
 26  GarageCars     1460 non-null   int64  
 27  GarageArea     1460 non-null   int64  
 28  WoodDeckSF     1460 non-null   int64  
 29  OpenPorchSF    1460 non-null   int64  
 30  EnclosedPorch  1460 non-null   int64  
 31  3SsnPorch      1460 non-null   int64  
 32  ScreenPorch    1460 non-null   int64  
 33  PoolArea       1460 non-null   int64  
 34  MiscVal        1460 non-null   int64  
 35  MoSold         1460 non-null   int64  
 36  YrSold         1460 non-null   int64  
 37  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35)
memory usage: 433.6 KB
In [55]:
# transformed_values_cate_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   LotShape       1460 non-null   object
 3   LotConfig      1460 non-null   object
 4   Neighborhood   1460 non-null   object
 5   MasVnrType     1460 non-null   object
 6   ExterQual      1460 non-null   object
 7   ExterCond      1460 non-null   object
 8   Foundation     1460 non-null   object
 9   BsmtQual       1460 non-null   object
 10  BsmtCond       1460 non-null   object
 11  BsmtExposure   1460 non-null   object
 12  Heating        1460 non-null   object
 13  CentralAir     1460 non-null   object
 14  KitchenQual    1460 non-null   object
 15  FireplaceQu    1460 non-null   object
 16  GarageFinish   1460 non-null   object
 17  SaleType       1460 non-null   object
 18  SaleCondition  1460 non-null   object
dtypes: object(19)
memory usage: 216.8+ KB
In [56]:
# fulldata_set=pd.concat([ndata,aa,data['SalePrice']],axis=1)
result_dataset = pd.concat([numeric_data, transformed_values_cate_data], axis=1)
result_dataset.head(3)
Out[56]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... BsmtQual BsmtCond BsmtExposure Heating CentralAir KitchenQual FireplaceQu GarageFinish SaleType SaleCondition
0 1 60 65.0 8450 7 5 2003 2003 196.0 706 ... Gd TA No GasA Y Gd Gd RFn WD Normal
1 2 20 80.0 9600 6 8 1976 1976 0.0 978 ... Gd TA Gd GasA Y TA TA RFn WD Normal
2 3 60 68.0 11250 7 5 2001 2002 162.0 486 ... Gd TA Mn GasA Y Gd TA RFn WD Normal

3 rows × 57 columns

Plot box plot for the new dataset

In [57]:
for i in result_dataset.columns:
    if result_dataset[i].dtypes=='int64' or result_dataset[i].dtypes=='float64':
        sns.boxplot(result_dataset[i])
        plt.show()
    else:
        plt.figure(figsize=(10,5))
        sns.boxplot(x=result_dataset['SalePrice'],y=result_dataset[i])
        plt.show()

Complete

In [ ]: